# AMAR ET AL. - COUNTERING MISINFORMATION EARLY (2025)
## REPLICATION FILE: 01_data_cleaning.R
# ----
# Setup ----
library(pacman)
p_load(readxl, srvyr, tidyverse, readstata13, lubridate, here)

# Baseline ----
## Load baseline data ----
# Read the raw baseline sheet without promoting any row to a header: the first
# two rows both carry header text and are merged into one set of names below.
baseline <- read_xlsx(
  "data/raw/baseline/baseline_raw.xlsx",
  sheet = "final",
  col_names = FALSE,
  guess_max = 10000
)

# Fix column names: take the second header row where it is filled in, fall back
# to the first row otherwise, and replace spaces with underscores.
colnames(baseline) <- baseline |>
  slice(1:2) |>
  t() |>
  as.data.frame() |>
  mutate(V2 = str_replace_all(coalesce(V2, V1), " ", "_")) |>
  pull(V2)

# Keep only the columns that received a name, then delete the two header rows
baseline <- baseline[!is.na(names(baseline))] |>
  slice(-(1:2))

names(baseline) # print the rebuilt column names
## Remove unnecessary columns ----
baseline <- baseline |>
  select(!c(
    form_name:spd_address,
    starts_with("gp"),
    contains("inst"),
    starts_with("spd"),
    sec1_q1_comment,
    sec1_q9,
    sec3_q14,
    sec4_q1,
    sec4_q3,
    sec5_q6,
    sec5_q7,
    sec5_q8,
    matches("sec5_q10.{0,4}$"),
    matches("sec5_q13.{0,1}$"),
    sec5_q14,
    sec5_q16,
    sec8_q1
  )) |>
  # rename sec5_q10a questions (social media): swap the two-letter code and the
  # trailing digit so the digit comes first
  rename_with(
    function(nm) str_replace(nm, "(sec5_q10a_)(\\w{2})_(\\d)", "\\1\\3_\\2"),
    .cols = starts_with("sec5_q10a")
  ) |>
  # convert `visit_date` from a day count (origin 1899-12-30) to a date
  mutate(visit_date = as_date(as.numeric(visit_date), origin = "1899-12-30"))

## Merge village information ----
# Village-level lookup table; the first row of the sheet holds the column names
bimli_villages <- read_xlsx(
  "data/raw/villages/final_villages.xlsx",
  sheet = "final",
  col_names = TRUE,
  guess_max = 10000
) %>%
  mutate(
    # district-spillover strata variable; left NA where `spillover_pre` is missing
    district_spillover_pre = if_else(
      !is.na(spillover_pre),
      paste0(district_name, "_", spillover_pre),
      NA_character_
    ),
    # the join key is compared against character codes in `baseline`
    BIMLI_Village_Survey_Code = as.character(BIMLI_Village_Survey_Code)
  )

# Merge baseline with village-level information
baseline <- baseline %>%
  mutate(
    # correct village codes for 11604 and 16202 (incorrect in raw data)
    BIMLI_Village_Survey_Code = case_when(
      BIMLI_Village_Survey_Code == "11604" ~ "11605",
      BIMLI_Village_Survey_Code == "16202" ~ "16212",
      TRUE ~ BIMLI_Village_Survey_Code
    )
  ) %>%
  left_join(
    select(
      bimli_villages,
      CLCDC, BIMLI_Village_Survey_Code,
      district_name, block_name, CLF, Gram_Panchayat,
      spillover_pre, district_spillover_pre, spillover_post
    ),
    by = join_by(BIMLI_Village_Survey_Code)
  )

## Merge village nightlights data ----
# Load the village nightlights file and give the 2021 VIIRS column its final name
village_nightlights <- read_csv("data/raw/villages/nightlight.csv") |>
  rename(village_nightlight_viirs_mean_2021 = viirs_annual_mean2021)

# Attach the nightlight mean and BJP coalition vote share to each baseline row,
# matching on the village attendance code
baseline <- left_join(
  baseline,
  village_nightlights |>
    select(village_code, village_nightlight_viirs_mean_2021, bjp_coalition_vote_share),
  by = join_by(BIMLI_Village_AttendanceCode == village_code)
)

## Party and jati data ----
# Load
# Load the jati file and drop rows without a child code before merging
jati_final <- read_csv("data/raw/baseline/jati_final.csv") |>
  drop_na(child_code)

# Attach every jati column to baseline by child code
baseline <- left_join(
  baseline,
  jati_final,
  by = join_by(Child_BIMLI_Code == child_code)
)

# Teacher data ----
## Load data ----
english_teachers <- read_xlsx(
  "data/raw/teachers/english_teachers.xlsx",
  sheet = "final",
  guess_max = 10000
)
media_teachers <- read_xlsx(
  "data/raw/teachers/media_teachers.xlsx",
  sheet = "final",
  guess_max = 10000
)

## Recode ----
# Both teacher tables get a space-free character merge code plus `teacher_`-
# prefixed copies of the demographic indicators. The merge code is built from
# `survey_code` for English teachers and `attendance_code` for media teachers.
english_teachers <- english_teachers |>
  mutate(
    teacher_merge_code = str_remove_all(as.character(survey_code), " "),
    teacher_muslim = muslim,
    teacher_female = female,
    teacher_general = general
  )

media_teachers <- media_teachers |>
  mutate(
    teacher_merge_code = str_remove_all(as.character(attendance_code), " "),
    teacher_muslim = muslim,
    teacher_female = female,
    teacher_general = general
  )

# Endline ----
## Load data ----
endline <- read_xlsx("data/raw/endline/endline_raw.xlsx", sheet = "final", guess_max = 12110, n_max = 12110)

## Combine randomized questions ----
# Columns whose names differ only by an order identifier (`_o<digits>`) are
# variants of one question. Build a dictionary with the de-duplicated name
# (order identifier masked, leaving a double underscore) and a regex that
# matches every order variant of that name.
# NOTE(review): `name` and `regex` are de-duplicated independently, so the two
# vectors must end up the same length for `data.frame()` to pair them correctly.
endline_colnames <- data.frame(
  name = colnames(endline) %>%
    str_replace("_o(\\d)+", "_") %>% # mask order identifier (on) from column names
    unique(),
  regex = colnames(endline) %>%
    str_replace("_o(\\d)+", "_o.*") %>% # regex to capture all columns that are the same when order is ignored
    str_replace("q1$|q1(?=_)", "q1(?!\\\\d)") %>% # `q1` must not be followed by a digit
    str_c("^", ., "$") %>%
    unique()
)

# filtered dictionary table for columns to combine (names containing `__`)
cols_to_combine <- endline_colnames %>%
  filter(str_detect(name, "^sec.*__"))

# For each combined name, sum its order variants row by row.
# NOTE(review): with `na.rm = TRUE` a row where every variant is NA sums to 0,
# not NA -- confirm this is intended.
combined_cols <- map(
  cols_to_combine$regex,
  \(x) {
    endline %>%
      select(matches(x, perl = TRUE)) %>%
      rowSums(na.rm = TRUE)
  }
) %>%
  set_names(cols_to_combine$name) %>%
  as.data.frame()

endline <- endline %>%
  select(!matches(str_c(cols_to_combine$regex, collapse = "|"), perl = TRUE)) %>% # remove sparse columns
  select(where(\(x) !all(is.na(x)))) %>% # drop columns that are entirely NA
  bind_cols(combined_cols) # add combined columns

# order according to `endline_colnames` table; the join key is stated
# explicitly so the join does not depend on (or report) an inferred key
ord <- endline_colnames["name"] %>%
  inner_join(data.frame(name = colnames(endline)), by = "name") %>%
  pull(name)

endline <- endline[ord] %>%
  rename_with(\(x) str_replace(x, "__", "_")) %>%
  # child ID = village code + "." + child code left-padded to two characters
  mutate(
    Child_BIMLI_Code = str_c(village_code, str_pad(child_code, width = 2, pad = "0"), sep = "."),
    .after = "serial_no"
  )

## Unify sections 2 and 4 ----
# order of sections 2 and 4 was randomized
# Collapse the "a" and "b" copies of the randomized sections into one set of
# columns, then strip the "a" suffix and move the unified columns into place.
endline <- endline %>%
  # Recode the random draw into an order label: draws below 0.5 become "2,4,3",
  # all others "3,4,2".
  mutate(random_sec2_3_4 = if_else(random_sec2_3_4 < 0.5, "2,4,3", "3,4,2"),
         # For every column from `random_sec2a` through `sec4a_q10`: keep its own
         # value when the order is "2,4,3"; otherwise take the value of the
         # column with the same name but "b" in place of "a" after `sec<digit>`.
         # `get()` fetches that sibling by name (presumably resolved through
         # dplyr's data mask -- verify).
         across(random_sec2a:sec4a_q10, \(x) if_else(random_sec2_3_4 == "2,4,3", x, get(str_replace(cur_column(), "(sec\\d)a", "\\1b"))))) %>%
  # Drop the column range `random_sec4b_q1:sec2b_q7` now that it has been merged
  select(-(random_sec4b_q1:sec2b_q7)) %>%
  # Remove the "a" suffix after `sec<digit>` from column names
  rename_with(\(x) str_replace(x, "(sec\\d)a", "\\1")) %>%
  # Put sec2 questions after `random_sec2`, and sec3 questions after `sec2_q10`
  relocate(any_of(str_c("sec2_q", 1:10)), .after = random_sec2) %>%
  relocate(any_of(c(str_c("sec3_q", 1:11))), .after = sec2_q10)

## Rearrange columns ----
endline <- endline |>
  # Drop instruction and photo columns, plus a fixed list of question columns
  # that are removed only if they are present.
  select(!c(
    contains("inst"),
    #contains("random"),
    contains("photo"),
    any_of(c(
      "sec4_q1",
      "sec4_q9",
      "sec5_q1",
      "sec5_q2",
      "sec8_q11",
      "sec8_q12",
      "sec10_q11"
    ))
  )) |>
  # Move each family of randomized sub-question columns behind its anchor
  # column; `any_of()` skips names that do not exist.
  relocate(any_of(str_c("sec3_q", 1:11)), .after = "sec2_q10") |>
  relocate(any_of(str_c("sec4_q1_", 1:5)), .after = "sec3_q11") |>
  relocate(any_of(str_c("sec4_q9_", letters[1:5])), .after = "sec4_q7") |>
  relocate(any_of(str_c("sec5_q1_", letters[2:18])), .after = "sec4_q10") |>
  relocate(any_of(str_c("sec5_q2_", letters[1:6])), .after = "sec5_q1_r") |>
  relocate(any_of(str_c("sec5_q6_", letters[1:8])), .after = "sec5_q3_e") |>
  relocate(any_of(str_c("sec8_q11_", letters[1:11])), .after = "sec8_q8") |>
  relocate(any_of(str_c("sec10_q11_", letters[1:11])), .after = "sec10_q14")

# Follow-up ----
## Load data ----
follow_up <- read_xlsx("data/raw/follow_up/follow_up_raw.xlsx", sheet = "final", .name_repair = "universal")

# Keep the untouched wide table; the cleaned `follow_up` is rebuilt from it below
follow_up_large <- follow_up

## Combine randomized questions ----
# Columns whose names differ only by an order identifier (`_o<digits>`) are
# variants of one question. Build a dictionary with the de-duplicated name
# (order identifier masked, leaving a double underscore) and a regex that
# matches every order variant of that name.
# NOTE(review): `name` and `regex` are de-duplicated independently, so the two
# vectors must end up the same length for `data.frame()` to pair them correctly.
follow_up_colnames <- data.frame(
  name = colnames(follow_up_large) %>%
    str_replace("_o(\\d)+", "_") %>% # mask order identifier (on) from column names
    unique(),
  regex = colnames(follow_up_large) %>%
    str_replace("_o(\\d)+", "_o.*") %>% # regex to capture all columns that are the same when order is ignored
    str_replace("q1$|q1(?=_)", "q1(?!\\\\d)") %>% # `q1` must not be followed by a digit
    str_c("^", ., "$") %>%
    unique()
)

# filtered dictionary table for columns to combine (names containing `__`)
cols_to_combine <- follow_up_colnames %>%
  filter(str_detect(name, "^sec.*__"))

# For each combined name, sum its order variants row by row.
# NOTE(review): with `na.rm = TRUE` a row where every variant is NA sums to 0,
# not NA -- confirm this is intended.
combined_cols <- map(
  cols_to_combine$regex,
  \(x) {
    follow_up_large %>%
      select(matches(x, perl = TRUE)) %>%
      rowSums(na.rm = TRUE)
  }
) %>%
  set_names(cols_to_combine$name) %>%
  as.data.frame()

follow_up <- follow_up_large %>%
  select(!matches(str_c(cols_to_combine$regex, collapse = "|"), perl = TRUE)) %>% # remove sparse columns
  select(where(\(x) !all(is.na(x)))) %>% # drop columns that are entirely NA
  bind_cols(combined_cols) # add combined columns

# order according to `follow_up_colnames` table; the join key is stated
# explicitly so the join does not depend on (or report) an inferred key
ord <- follow_up_colnames["name"] %>%
  inner_join(data.frame(name = colnames(follow_up)), by = "name") %>%
  pull(name)

follow_up <- follow_up[ord] %>%
  rename_with(\(x) str_replace(x, "__", "_")) #%>%
  #mutate(Child_BIMLI_Code = str_c(village_code, str_pad(child_code, width = 2, pad = 0), sep = "."), .after = "serial_no")

## Fix child code ----
# Normalise the free-text child code and flag whether it has the expected shape.
# All string helpers used here are vectorised, so no row-wise grouping is needed.
follow_up <- follow_up %>%
  mutate(
    child_code_clean = child_code %>%
      str_replace_all("\r\n", "") %>% # Remove all "\r\n"
      str_replace_all(" ", ""),       # Remove all spaces
    # Ensure format XXXA.XX: insert the dot when the code has none
    child_code_clean = if_else(
      !str_detect(child_code_clean, "\\."),
      str_replace(child_code_clean, "([0-9]{3}[A-Za-z])([0-9]{2})", "\\1.\\2"),
      child_code_clean
    ),
    # TRUE when the cleaned code is exactly three digits, a letter, a dot and two digits
    matches = str_detect(child_code_clean, "^[0-9]{3}[A-Za-z]\\.[0-9]{2}$")
  )

## Recode (student) ----
# Student follow-up: keep rows with `resp_status == 2`, drop known duplicate
# serial numbers, and derive the `follow_*` outcome variables.
follow_up_student <- follow_up %>%
  dplyr::filter(resp_status == 2) %>%
  dplyr::filter(!serial_no %in% c(358, 101, 102, 300, 130, 216, 224, 387, 113, 168,
                                  45, 364, 97, 186, 200, 201, 208, 307, 329, 330, 209, 377, 179, 259,
                                  175, 2514, 2191, 5064)) %>% # Removing duplicates
  mutate(
    follow_student_endtime = as_datetime(EndTime),

    # Veracity discernment (non-political); some items are negated
    follow_discernment1 = sec2a_q1, # Cow urine
    follow_discernment2 = sec2a_q2, # Papaya leaves
    follow_discernment3 = sec2a_q4, # Mobile towers cause cancer
    follow_discernment4 = sec2a_q5, # Snake bite exorcism
    follow_discernment5 = -sec2a_q7, # Seatbelts = good
    follow_discernment6 = -sec2a_q8, # Covid vaccine effective
    follow_discernment7 = -sec2a_q9, # handwashing prevents infection
    follow_discernment8 = -sec2a_q10, # Smoking causes cancer

    # Veracity discernment (political)
    follow_discernment_political1 = sec2a11_q1, # Campaign funds from abroad
    follow_discernment_political2 = sec2a11_q2, # Electoral fraud
    follow_discernment_political3 = -sec2a11_q4, # BJP receives more funds
    follow_discernment_political4 = sec2a11_q5, # Modi lost seat

    # Source discernment; some items are negated
    follow_source_discern_specific1 = -sec5_q2_a, # local health worker or community health center
    follow_source_discern_specific2 = -sec5_q2_b, # government pamphlets or posters
    follow_source_discern_specific3 = -sec5_q2_c, # TV interview with AIIMS
    follow_source_discern_specific4 = sec5_q2_d, # family remedies
    follow_source_discern_specific5 = sec5_q2_e, # WhatsApp forwards
    follow_source_discern_specific6 = sec5_q2_f, # Ayurvedic doctor

    # SEC 5 Q1-QX still to do (tradeoffs)
    # Tradeoff vignette: the first non-missing column among sec5_q5_1..8
    # determines the answer ("a" = 1, "b" = 0, anything else = NA) and, below,
    # the vignette attributes.
    follow_tradeoffs_chose_doctor = case_when(
      !is.na(sec5_q5_1) ~ case_when(sec5_q5_1 == "a" ~ 1,
                                    sec5_q5_1 == "b" ~ 0),
      !is.na(sec5_q5_2) ~ case_when(sec5_q5_2 == "a" ~ 1,
                                    sec5_q5_2 == "b" ~ 0),
      !is.na(sec5_q5_3) ~ case_when(sec5_q5_3 == "a" ~ 1,
                                    sec5_q5_3 == "b" ~ 0),
      !is.na(sec5_q5_4) ~ case_when(sec5_q5_4 == "a" ~ 1,
                                    sec5_q5_4 == "b" ~ 0),
      !is.na(sec5_q5_5) ~ case_when(sec5_q5_5 == "a" ~ 1,
                                    sec5_q5_5 == "b" ~ 0),
      !is.na(sec5_q5_6) ~ case_when(sec5_q5_6 == "a" ~ 1,
                                    sec5_q5_6 == "b" ~ 0),
      !is.na(sec5_q5_7) ~ case_when(sec5_q5_7 == "a" ~ 1,
                                    sec5_q5_7 == "b" ~ 0),
      !is.na(sec5_q5_8) ~ case_when(sec5_q5_8 == "a" ~ 1,
                                    sec5_q5_8 == "b" ~ 0),
      TRUE ~ NA_integer_),
    follow_tradeoffs_disease = case_when(
      !is.na(sec5_q5_1) ~ "Malaria",
      !is.na(sec5_q5_2) ~ "Malaria",
      !is.na(sec5_q5_3) ~ "Malaria",
      !is.na(sec5_q5_4) ~ "Malaria",
      !is.na(sec5_q5_5) ~ "Common cold",
      !is.na(sec5_q5_6) ~ "Common cold",
      !is.na(sec5_q5_7) ~ "Common cold",
      !is.na(sec5_q5_8) ~ "Common cold",
      TRUE ~ NA_character_
    ),
    follow_tradeoffs_doctor_price = case_when(
      !is.na(sec5_q5_1) ~ "Free",
      !is.na(sec5_q5_2) ~ "Free",
      !is.na(sec5_q5_3) ~ "300 Rupees",
      !is.na(sec5_q5_4) ~ "300 Rupees",
      !is.na(sec5_q5_5) ~ "Free",
      !is.na(sec5_q5_6) ~ "Free",
      !is.na(sec5_q5_7) ~ "300 Rupees",
      !is.na(sec5_q5_8) ~ "300 Rupees",
      TRUE ~ NA_character_
    ),
    follow_tradeoffs_doctor_distance = case_when(
      !is.na(sec5_q5_1) ~ "Very close",
      !is.na(sec5_q5_2) ~ "10kms away",
      !is.na(sec5_q5_3) ~ "Very close",
      !is.na(sec5_q5_4) ~ "10kms away",
      !is.na(sec5_q5_5) ~ "Very close",
      !is.na(sec5_q5_6) ~ "10kms away",
      !is.na(sec5_q5_7) ~ "Very close",
      !is.na(sec5_q5_8) ~ "10kms away",
      TRUE ~ NA_character_
    ),
    # `paste0()` turns missing parts into the text "NA", so this is never NA
    follow_tradeoffs_combination = paste0(follow_tradeoffs_disease, ", ", follow_tradeoffs_doctor_price, ", ", follow_tradeoffs_doctor_distance),
    follow_ayurveda_effective = case_when(
      sec5_q4 == "a" ~ 0, # Ayurveda effective
      sec5_q4 == "b" ~ 1,
      sec5_q4 == "c" ~ 2,
      sec5_q4 == "d" ~ 3,
      TRUE ~ NA_integer_),
    follow_misinfo_cue = case_when( # 6. When you encounter bad quality information or info you are unsure about, what is the main factor you look for to make a decision on whether to trust it?
      sec5_q6 == "a" ~ "Reputation of Source or Sender",
      sec5_q6 == "b" ~ "Aesthetic Cue",
      sec5_q6 == "c" ~ "Emotional Reaction",
      sec5_q6 == "d" ~ "Judgment of Others",
      TRUE ~ NA_character_),
    # One indicator per answer option of sec5_q6 (NA for any other value)
    follow_misinfo_cue_reputation = case_when(
      sec5_q6 == "a" ~ 1,
      sec5_q6 %in% c("b", "c", "d") ~ 0,
      TRUE ~ NA_integer_),
    follow_misinfo_cue_aesthetic = case_when(
      sec5_q6 == "b" ~ 1,
      sec5_q6 %in% c("a", "c", "d") ~ 0,
      TRUE ~ NA_integer_),
    follow_misinfo_cue_emotion = case_when(
      sec5_q6 == "c" ~ 1,
      sec5_q6 %in% c("a", "b", "d") ~ 0,
      TRUE ~ NA_integer_),
    follow_misinfo_cue_judgment = case_when(
      sec5_q6 == "d" ~ 1,
      sec5_q6 %in% c("a", "b", "c") ~ 0,
      TRUE ~ NA_integer_),

    follow_misinfo_react = sec5_q7, # 7. If someone you knew told you a piece of information that you know is definitely false/untrue, what would be your primary reaction to them?
    # Indicators use substring matching, so an answer listing several letters
    # sets several indicators to 1
    follow_misinfo_react_explain = case_when(
      grepl("a", sec5_q7) ~ 1,
      !is.na(sec5_q7) ~ 0,
      TRUE ~ NA_integer_),
    follow_misinfo_react_teach_strategy = case_when(
      grepl("b", sec5_q7) ~ 1,
      !is.na(sec5_q7) ~ 0,
      TRUE ~ NA_integer_),
    follow_misinfo_react_admonish = case_when(
      grepl("c", sec5_q7) ~ 1,
      !is.na(sec5_q7) ~ 0,
      TRUE ~ NA_integer_),
    follow_misinfo_react_dont_share = case_when(
      grepl("d", sec5_q7) ~ 1,
      !is.na(sec5_q7) ~ 0,
      TRUE ~ NA_integer_),

    # Each indicator divided by the number of options selected.
    # NOTE(review): a non-missing answer containing none of a-d gives 0 / 0 = NaN.
    follow_misinfo_react_sd_explain = case_when(
      !is.na(sec5_q7) ~ (follow_misinfo_react_explain / (follow_misinfo_react_explain + follow_misinfo_react_teach_strategy +
                                                           follow_misinfo_react_admonish + follow_misinfo_react_dont_share)),
      TRUE ~ NA_integer_),
    follow_misinfo_react_sd_teach_strategy = case_when(
      !is.na(sec5_q7) ~ (follow_misinfo_react_teach_strategy / (follow_misinfo_react_explain + follow_misinfo_react_teach_strategy +
                                                                  follow_misinfo_react_admonish + follow_misinfo_react_dont_share)),
      TRUE ~ NA_integer_),
    follow_misinfo_react_sd_admonish = case_when(
      !is.na(sec5_q7) ~ (follow_misinfo_react_admonish / (follow_misinfo_react_explain + follow_misinfo_react_teach_strategy +
                                                            follow_misinfo_react_admonish + follow_misinfo_react_dont_share)),
      TRUE ~ NA_integer_),
    follow_misinfo_react_sd_dont_share = case_when(
      !is.na(sec5_q7) ~ (follow_misinfo_react_dont_share / (follow_misinfo_react_explain + follow_misinfo_react_teach_strategy +
                                                              follow_misinfo_react_admonish + follow_misinfo_react_dont_share)),
      TRUE ~ NA_integer_),

    follow_mechanism_bimli_only = case_when( # Think back to your experience in the classroom and please tell me which one of the following options comes closer to your experience
      sec5_q8 == "a" ~ "Learned new concepts",
      sec5_q8 == "b" ~ "Corrected misbeliefs",
      sec5_q8 == "c" ~ "Not to repeat misinformation",
      TRUE ~ NA_character_),
    follow_mechanism_bimli_only_concepts = case_when( # a. In BIMLI I learned new concepts and strategies that I can now rely on in my daily life to detect when something is false.
      sec5_q8 == "a" ~ 1,
      sec5_q8 %in% c("b", "c") ~ 0,
      TRUE ~ NA_integer_),
    follow_mechanism_bimli_only_correct_misbeliefs = case_when( # b. In BIMLI I realized that many of the things I previously believed are actually not true
      sec5_q8 == "b" ~ 1,
      sec5_q8 %in% c("a", "c") ~ 0,
      TRUE ~ NA_integer_),
    follow_mechanism_bimli_only_normative = case_when( # c. Bimli helped me realize that it is not good to repeat misinformation in public
      sec5_q8 == "c" ~ 1,
      sec5_q8 %in% c("a", "b") ~ 0,
      TRUE ~ NA_integer_),
    follow_misinfo_teacher_react = case_when( # 9. Think back to your experience in the classroom and your teacher. During the classes, if a student said a piece of information that was false/untrue, what was the primary reaction of the teacher?
      sec5_q9 == "a" ~ "Explain they are wrong",
      sec5_q9 == "b" ~ "Explain how to check veracity",
      sec5_q9 == "c" ~ "Admonish the student",
      sec5_q9 == "d" ~ "Tell them not to share misinformation",
      TRUE ~ NA_character_),
    follow_misinfo_teacher_react_explain_wrong = case_when(
      sec5_q9 == "a" ~ 1,
      sec5_q9 %in% c("b", "c", "d") ~ 0,
      TRUE ~ NA_integer_),
    follow_misinfo_teacher_react_check_veracity = case_when(
      sec5_q9 == "b" ~ 1,
      sec5_q9 %in% c("a", "c", "d") ~ 0,
      TRUE ~ NA_integer_),
    follow_misinfo_teacher_react_admonish = case_when(
      sec5_q9 == "c" ~ 1,
      sec5_q9 %in% c("a", "b", "d") ~ 0,
      TRUE ~ NA_integer_),
    follow_misinfo_teacher_react_dont_share = case_when(
      sec5_q9 == "d" ~ 1,
      sec5_q9 %in% c("a", "b", "c") ~ 0,
      TRUE ~ NA_integer_)) %>%
  mutate(
    # Interview timing relative to the earliest interview date in this subset
    follow_date = lubridate::as_date(StartTime),
    follow_days_from_first = as.numeric(follow_date - min(follow_date, na.rm = TRUE)),
    follow_weeks_from_first = case_when(
      follow_days_from_first < 7 ~ "Week 1",
      follow_days_from_first >= 7 & follow_days_from_first < 14  ~ "Week 2",
      follow_days_from_first >= 14 & follow_days_from_first < 21  ~ "Week 3",
      follow_days_from_first >= 21 ~ "Week 4+",
      TRUE ~ NA_character_))

## Recode (guardian) ----
# Guardian follow-up: rows with `resp_status == 1`, minus known duplicate
# serial numbers, with the guardian `follow_guardian_*` variables derived.
follow_up_guardian <- follow_up |>
  dplyr::filter(
    resp_status == 1,
    !serial_no %in% c(358, 101, 102, 300, 130, 216, 224, 387, 113, 168, 1811, 452, 4796, 2189) # Removing duplicates
  ) |>
  mutate(
    follow_guardian_endtime = as_datetime(EndTime),

    # Why the guardian sent the child (labelled answer)
    follow_guardian_reason_send_child = case_when(
      parents_q1 == "a" ~ "Cost free",
      parents_q1 == "b" ~ "Trust Jeevika",
      parents_q1 == "c" ~ "Wanted kid out of house",
      parents_q1 == "d" ~ "Wanted kid to learn",
      TRUE ~ NA_character_
    ),

    # 1 when the answer is "a", 0 for "b" or "c", NA otherwise
    follow_guardian_send_child_again = case_when(
      parents_q2 == "a" ~ 1,
      parents_q2 %in% c("b", "c") ~ 0,
      TRUE ~ NA_integer_
    ),

    # Veracity discernment non-political
    follow_guardian_discernment1 = sec2b_q1,   # Cow urine
    follow_guardian_discernment2 = sec2b_q2,   # Papaya leaves
    follow_guardian_discernment3 = sec2b_q4,   # Mobile towers cause cancer
    follow_guardian_discernment4 = sec2b_q5,   # Snake bite exorcism
    follow_guardian_discernment5 = -sec2b_q7,  # Seatbelts = good
    follow_guardian_discernment6 = -sec2b_q8,  # Covid vaccine effective
    follow_guardian_discernment7 = -sec2b_q9,  # handwashing prevents infection
    follow_guardian_discernment8 = -sec2b_q10, # Smoking causes cancer

    # Veracity discernment political
    follow_guardian_discernment_political1 = sec2b11_q1,  # Campaign funds from abroad
    follow_guardian_discernment_political2 = sec2b11_q2,  # Electoral fraud
    follow_guardian_discernment_political3 = -sec2b11_q4, # BJP receives more funds
    follow_guardian_discernment_political4 = sec2b11_q5,  # Modi lost seat

    # Source discernment
    follow_guardian_source_discern_specific1 = -sec5b_q2_a, # local health worker or community health center
    follow_guardian_source_discern_specific2 = -sec5b_q2_b, # government pamphlets or posters
    follow_guardian_source_discern_specific3 = -sec5b_q2_c, # TV interview with AIIMS
    follow_guardian_source_discern_specific4 = sec5b_q2_d,  # family remedies
    follow_guardian_source_discern_specific5 = sec5b_q2_e,  # WhatsApp forwards
    follow_guardian_source_discern_specific6 = sec5b_q2_f,  # Ayurvedic doctor

    # Ayurveda effective: answers a-d mapped to 0-3
    follow_guardian_ayurveda_effective = case_when(
      parents_q6 == "a" ~ 0,
      parents_q6 == "b" ~ 1,
      parents_q6 == "c" ~ 2,
      parents_q6 == "d" ~ 3,
      TRUE ~ NA_integer_
    )
  )

# Replacement dictionaries for the guardian's free-text relation to the child.
# Each is a named character vector (name = pattern to find, value = replacement)
# passed to `str_replace_all()` further down, one dictionary after another.
# Patterns match as substrings, not whole words. Several keys (e.g. "mothermi",
# "brotherya") look like text already rewritten by an earlier entry, which
# presumably relies on entries being applied in vector order -- verify before
# reordering or pruning anything here.
replacements_mother <- c(
  "maa" = "mother",
  "mather" = "mother",
  "mummy" = "mother",
  "mata" = "mother", 
  "ma" = "mother", # short pattern: also matches "ma" inside unrelated words
  "maaa" = "mother",
  "maa ki" = "mother",
  "mothermi" = "mother",
  "mothersi" = "mother",
  "mothermi ji" = "mother",
  "mothera" = "mother",
  "mothermother ji" = "mother",
  "mother ki" = "mother",
  "mothermother" = "mother",
  "mother ji" = "mother"
)

# Variants mapped to "father"
replacements_father <- c(
  "pita" = "father",
  "papa" = "father",
  "pata" = "father",
  "papa ji" = "father", 
  "pita ji" = "father",
  "pata ji" = "father",
  "abbu" = "father", 
  "father ji" = "father"
)

# Variants mapped to "sister"
replacements_sister <- c(
  "behan" = "sister",
  "bahan" = "sister" ,
  "dee" = "sister",
  "badi sister" = "sister",
  "nauadih" = "sister",
  "didi" = "sister"
)

# Variants mapped to "brother"
replacements_brother <- c(
  "bhai" = "brother",
  "big brother" = "brother" ,
  "bhaiya" = "brother",
  "bare bhai" = "brother" ,
  # NOTE(review): "dee" is also a key in `replacements_sister`, which is applied
  # first, so this entry likely never matches -- confirm the intended mapping.
  "dee" = "brother",
  "bhaiya ji" = "brother",
  "brotherya ji" = "brother",
  "bada brother" = "brother",
  "brotherya" = "brother",
  "bare brother" = "brother",
  "bada brother" = "brother" # NOTE(review): duplicate of the key three lines up
)

# Variants mapped to "other female relatives"
replacements_other_female <- c(
  "chachi" = "other female relatives",
  "nani" = "other female relatives" ,
  "chachi hai" = "other female relatives",
  "badri mother" = "other female relatives" ,
  "badi mother" = "other female relatives",
  "aunty" = "other female relatives",
  "chachi ji" = "other female relatives",
  "aunty ji" = "other female relatives",
  "nani ji" = "other female relatives",
  "mosi" = "other female relatives",
  "mother in law" = "other female relatives",
  "bari mother" = "other female relatives",
  "bhavi" = "other female relatives",
  "bhabhi" = "other female relatives" ,
  "bavi" = "other female relatives",
  "fufi" = "other female relatives",
  "dadi" = "other female relatives",
  "other female relatives hai" = "other female relatives",
  "other female relatives ji" = "other female relatives"
)

# Variants mapped to "other male relatives"
replacements_other_male <- c(
  "uncle" = "other male relatives",
  "chacha" = "other male relatives",
  "dada ji" = "other male relatives",
  "dadai" = "other male relatives",
  "grandfather" = "other male relatives",
  "dada" =  "other male relatives",
  "other male relatives ji" =  "other male relatives",
  # NOTE(review): this rewrites a *female* label to the male category; the same
  # key is already handled in `replacements_other_female` -- confirm intent.
  "other female relatives ji" =  "other male relatives",
  "husband"  =  "other male relatives"
)

# Normalise the guardian's free-text relation to the child into a small set of
# labels, using the replacement dictionaries defined above.
follow_up_guardian$follow_guardian_child_relation <- follow_up_guardian$child_relation %>%
  tolower() %>% # Convert to lowercase for uniformity
  trimws() %>%  # Remove leading/trailing whitespaces
  str_replace_all("[\\r\\n]+", " ") %>% # Replace newline characters with a space
  str_squish() %>% # Collapse repeated whitespace
  # Dictionaries are applied in this order; later ones see the earlier output
  str_replace_all(replacements_mother) %>%
  str_replace_all(replacements_father) %>%
  str_replace_all(replacements_sister) %>%
  str_replace_all(replacements_brother) %>%
  str_replace_all(replacements_other_female) %>%
  str_replace_all(replacements_other_male) %>%
  # Entries that hold a person's name, recoded by hand to "father"
  str_replace_all("anil paswan", "father") %>%
  str_replace_all("ravindra thakur", "father") %>%
  # Set remaining unusable entries to NA. Spellings such as "kumotherri" are
  # matched as they appear here, after the substring replacements above.
  {ifelse(. %in% c("yatendra thakur", "shakshi kumotherri", "hira devi", "sartimother devi", "beta", "sree nawal", "saurabh kumotherr", "g", "dinesh kumotherr", "sui"), NA_character_, .)}

# Attendance ----
## Load data ----
# Control-arm roster: tag the subject and make `CLCDC.CODE` numeric
english <- read_xlsx("data/raw/attendance/control_attendance_raw.xlsx", .name_repair = "universal") |>
  mutate(
    Subject = "Spoken English",
    CLCDC.CODE = as.numeric(CLCDC.CODE)
  )

# Treatment-arm roster: tag the subject
media <- read_xlsx("data/raw/attendance/treatment_attendance_raw.xlsx", .name_repair = "universal") |>
  mutate(Subject = "Media Literacy")

## Combine English and media literacy ----
attendance <- bind_rows(english, media)

## Convert roll call data to binary ----
# Every `Session*` column becomes 1 when the entry reads "PRESENT" (any case),
# 0 for any other entry, and stays NA when the entry is missing.
attendance <- attendance |>
  mutate(across(
    starts_with("Session"),
    function(roll) as.numeric(str_to_upper(roll) == "PRESENT")
  ))

# Clean ----
## Remove untreated village ----
# Village 112C was not part of the final (pre-randomization) sample for logistical reasons
# Drop village 112C from both survey waves. Rows where the compared column is
# NA are dropped as well, because `filter()` keeps only rows that evaluate TRUE.
baseline <- baseline |>
  filter(BIMLI_Village_AttendanceCode != "112C")
# NOTE(review): endline is filtered on `village_name`, while its child IDs are
# built from `village_code` -- confirm `village_name` really holds the "112C" code.
endline <- endline |>
  filter(village_name != "112C")

## Remove misaligned and duplicated IDs ----
# get misaligned IDs
# Line up baseline, endline and attendance records per child ID, and recode the
# baseline gender and class answers to the labels compared below.
codes_names <- baseline |>
  full_join(endline, by = "Child_BIMLI_Code") |>
  full_join(attendance, by = join_by(Child_BIMLI_Code == Child.Code)) |>
  mutate(
    sec1_q3a = if_else(sec1_q3a == "a", "Male", "Female"),
    sec1_q4.x = case_match(
      sec1_q4.x,
      "a" ~ "Class 8",
      "b" ~ "Class 9",
      "c" ~ "Class 10",
      "d" ~ "Class 11",
      "e" ~ "Class 12 or equivalent"
    )
  ) |>
  select(
    Child_BIMLI_Code, BIMLI_Village_AttendanceCode,
    sec1_q2.x, sec1_q3a, sec1_q4.x,
    age, child.age, gender, child.class
  )

# IDs whose baseline answers (`sec1_q2.x`, gender, class) disagree with the
# attendance roster; comparisons involving NA do not flag a row
problem_codes <- codes_names |>
  filter(sec1_q2.x != child.age | sec1_q3a != gender | sec1_q4.x != child.class) |>
  pull(Child_BIMLI_Code)

# 132C, 133A, 177F, and 167D villages have misaligned codes

# IDs that occur more than once after the joins
duplicate_codes <- codes_names |>
  count(Child_BIMLI_Code) |>
  filter(n > 1) |>
  pull(Child_BIMLI_Code)

# additionally 142E.13 and 142E.19 refer to the same child according to the attendance data
duplicate_codes <- c(duplicate_codes, "142E.19")

# Records behind the duplicate IDs, kept for inspection
duplicate_codes_names <- codes_names |>
  filter(Child_BIMLI_Code %in% duplicate_codes)

# remove misaligned and duplicate IDs from all three tables
baseline <- baseline |>
  filter(!Child_BIMLI_Code %in% c(problem_codes, duplicate_codes))

endline <- endline |>
  filter(!Child_BIMLI_Code %in% c(problem_codes, duplicate_codes))

attendance <- attendance |>
  filter(!Child.Code %in% c(problem_codes, duplicate_codes))

## Save cleaned data ----
# baseline and attendance combined
# Attach the per-session attendance indicators to each baseline child
baseline_attendance <- left_join(
  baseline,
  select(attendance, c("Child.Code", starts_with("Session"))),
  by = join_by(Child_BIMLI_Code == Child.Code)
)

write_rds(baseline_attendance, "data/cleaned/baseline_attendance.rds")
write_rds(endline, "data/cleaned/endline.rds")

# The CSV named `baseline_attendance` must hold the merged table, matching the
# .rds file above (it was previously written from `baseline` alone).
write_csv(baseline_attendance, "data/cleaned/baseline_attendance.csv")
write_csv(endline, "data/cleaned/endline.csv")

# Subset ----
## Subset baseline data ----
# `dplyr::case_when()` that returns a factor whose levels follow the order in
# which the right-hand-side labels are written.
#
# Args:
#   ...: `condition ~ value` formulas, forwarded unchanged to `case_when()`.
# Returns:
#   A factor. Levels are the distinct literal, non-NA right-hand-side values in
#   order of first appearance; results outside those levels become NA.
fct_case_when <- function(...) {
  # https://stackoverflow.com/questions/49572416/r-convert-to-factor-with-order-of-levels-same-with-case-when
  # Unevaluated arguments, so the labels can be read straight off the formulas
  args <- as.list(match.call(expand.dots = FALSE)$...)

  # Right-hand side of each two-sided formula; anything else contributes nothing
  rhs <- lapply(args, function(f) {
    if (is.call(f) && identical(f[[1]], as.name("~")) && length(f) == 3L) {
      f[[3]]
    } else {
      NULL
    }
  })

  # Keep only literal scalar labels: symbols such as `NA_character_`, calls and
  # NA literals cannot serve as factor levels
  is_label <- vapply(
    rhs,
    function(v) is.atomic(v) && length(v) == 1L && !is.na(v),
    logical(1)
  )

  # `unique()` lets the same label appear in several branches without making
  # `factor()` fail on duplicated levels
  levels <- unique(unlist(rhs[is_label], use.names = FALSE))
  factor(dplyr::case_when(...), levels = levels)
}

# get variables of interest from baseline
baseline_subset <- baseline %>%
  mutate(
    gender = fct_case_when(
      sec1_q3a == "a" ~ "Boy",
      TRUE ~ "Girl"
    ),
    class = match(sec1_q4, letters) + 7,
    school = fct_case_when(
      sec1_q6 == "a" ~ "Government",
      sec1_q6 == "b" ~ "Private",
      sec1_q6 == "c" ~ "Other",
      sec1_q6 %in% c(88, 99) ~ "Don't know/Refused"
    ),
    religion = fct_case_when(
      sec3_q7 == "a" ~ "Hindu",
      sec3_q7 == "b" ~ "Muslim",
      sec3_q7 %in% letters[3:8] ~ "Other",
      sec3_q7 %in% c(88, 99) ~ "Don't know/Refused"
    ),
    caste = fct_case_when(
      sec3_q8 == "a" ~ "GEN",
      sec3_q8 == "b" ~ "OBC/EBC", 
      sec3_q8 == "c" ~ "SC", 
      sec3_q8 == "d" ~ "ST",
      TRUE ~ "Don't know/Refused"
    ),
    language = fct_case_when(
      sec3_q9 == "a" ~ "Hindi",
      sec3_q9 == "b" ~ "Suryapuri",
      sec3_q9 == "c" ~ "Maithili",
      sec3_q9 == "d" ~ "Bhojpuri",
      sec3_q9 == "e" ~ "Magahi",
      sec3_q9 == "f" ~ "Other",
      TRUE ~ "Don't know/Refused"
    ),
    language_hindu_num = case_when(
      sec3_q9 == "a" ~ 1,
      TRUE ~ 0
    ),
    proteins = fct_case_when(
      sec3_q11_1 == "a" ~ "Meat",
      sec3_q11_1 == "b" ~ "Eggs",
      sec3_q11_1 %in% c("a b", "b a") ~ "Both", 
      sec3_q11_1 == "c" ~ "Neither",
      TRUE ~ "Don't know/Refused"
    ),
    fathers_education = case_when(
      sec3_q12 == "k" ~ -1,
      sec3_q12 == "b" ~ 0, 
      sec3_q12 == "c" ~ 3, 
      sec3_q12 == "d" ~ 7.5,
      sec3_q12 == "e" ~ 10,
      sec3_q12 == "f" ~ 11,
      sec3_q12 == "g" ~ 12,
      sec3_q12 == "h" ~ 13,
      sec3_q12 == "i" ~ 14,
      sec3_q12 == "j" ~ 15,
      TRUE ~ NA
    ),
    mothers_education = case_when(
      sec3_q13 == "k" ~ -1,
      sec3_q13 == "b" ~ 0, 
      sec3_q13 == "c" ~ 3, 
      sec3_q13 == "d" ~ 7.5,
      sec3_q13 == "e" ~ 10,
      sec3_q13 == "f" ~ 11,
      sec3_q13 == "g" ~ 12,
      sec3_q13 == "h" ~ 13,
      sec3_q13 == "i" ~ 14,
      sec3_q13 == "j" ~ 15,
      TRUE ~ NA
    ),
    own_car = fct_case_when(
      sec3_q14_1 == "a" ~ "Yes",
      sec3_q14_1 == "b" ~ "No",
      TRUE ~ "Don't know/Refused"
    ),
    own_scooter = fct_case_when(
      sec3_q14_2 == "a" ~ "Yes",
      sec3_q14_2 == "b" ~ "No",
      TRUE ~ "Don't know/Refused"
    ),
    own_ac = fct_case_when(
      sec3_q14_3 == "a" ~ "Yes",
      sec3_q14_3 == "b" ~ "No",
      TRUE ~ "Don't know/Refused"
    ),
    own_computer = fct_case_when(
      sec3_q14_4 == "a" ~ "Yes",
      sec3_q14_4 == "b" ~ "No",
      TRUE ~ "Don't know/Refused"
    ),
    own_phone = fct_case_when(
      sec3_q14_5 == "a" ~ "Yes",
      sec3_q14_5 == "b" ~ "No",
      TRUE ~ "Don't know/Refused"
    ),
    own_wifi = fct_case_when(
      sec3_q14_6 == "a" ~ "Yes",
      sec3_q14_6 == "b" ~ "No",
      TRUE ~ "Don't know/Refused"
    ),
    own_fan = fct_case_when(
      sec3_q14_7 == "a" ~ "Yes",
      sec3_q14_7 == "b" ~ "No",
      TRUE ~ "Don't know/Refused"
    ),
    own_washer = fct_case_when(
      sec3_q14_8 == "a" ~ "Yes",
      sec3_q14_8 == "b" ~ "No",
      TRUE ~ "Don't know/Refused"
    ),
    own_fridge = fct_case_when(
      sec3_q14_9 == "a" ~ "Yes",
      sec3_q14_9 == "b" ~ "No",
      TRUE ~ "Don't know/Refused"
    ),
    own_tv = fct_case_when(
      sec3_q14_10 == "a" ~ "Yes",
      sec3_q14_10 == "b" ~ "No",
      TRUE ~ "Don't know/Refused"
    ),
    own_account = fct_case_when(
      sec3_q14_11 == "a" ~ "Yes",
      sec3_q14_11 == "b" ~ "No",
      TRUE ~ "Don't know/Refused"
    ),
    own_atm = fct_case_when(
      sec3_q14_12 == "a" ~ "Yes",
      sec3_q14_12 == "b" ~ "No",
      TRUE ~ "Don't know/Refused"
    ),
    own_lpg = fct_case_when(
      sec3_q14_13 == "a" ~ "Yes",
      sec3_q14_13 == "b" ~ "No",
      TRUE ~ "Don't know/Refused"
    ),
    own_toilet = fct_case_when(
      sec3_q14_14 == "a" ~ "Yes",
      sec3_q14_14 == "b" ~ "No",
      TRUE ~ "Don't know/Refused"
    ),
    own_pumping = fct_case_when(
      sec3_q14_15 == "a" ~ "Yes",
      sec3_q14_15 == "b" ~ "No",
      TRUE ~ "Don't know/Refused"
    ),
    own_tractor = fct_case_when(
      sec3_q14_16 == "a" ~ "Yes",
      sec3_q14_16 == "b" ~ "No",
      TRUE ~ "Don't know/Refused"
    ),
    raw_asset_index = rowSums(
      cbind(own_car== "Yes", own_scooter=="Yes", own_ac=="Yes", own_computer=="Yes", own_phone == "Yes", own_wifi=="Yes", own_fan=="Yes",  own_washer == "Yes", own_fridge == "Yes", own_tv=="Yes", own_account=="Yes", own_atm=="Yes", own_lpg=="Yes", own_toilet=="Yes", own_pumping=="Yes", own_tractor=="Yes"),
      na.rm = TRUE) / 16,
    mobile_internet = fct_case_when(
      sec5_q4 == "b" ~ "Yes",
      sec5_q4 == "a" ~ "No",
      sec5_q4 %in% c(88, 99) ~ "Don't know/Refused"
    ),
    trust_newspapers = fct_case_when(
      sec5_q6_1 == "a" ~ "Believe",
      sec5_q6_1 == "b" ~ "Disbelieve",
      TRUE ~ "Don't know/Refused"
    ),
    trust_social_media = fct_case_when(
      sec5_q6_2 == "a" ~ "Believe",
      sec5_q6_2 == "b" ~ "Disbelieve",
      TRUE ~ "Don't know/Refused"
    ),
    trust_tv = fct_case_when(
      sec5_q6_3 == "a" ~ "Believe",
      sec5_q6_3 == "b" ~ "Disbelieve",
      TRUE ~ "Don't know/Refused"
    ),
    trust_friends_family = fct_case_when(
      sec5_q6_4 == "a" ~ "Believe",
      sec5_q6_4 == "b" ~ "Disbelieve",
      TRUE ~ "Don't know/Refused"
    ),
    heard_of_NDTV_India = case_when(
      sec5_q13a_1a == "1" ~ 1,
      sec5_q13a_1a == "2" ~ 0,
      TRUE ~ NA_integer_
    ),
    heard_of_DD_news = case_when(
      sec5_q13b_1 == "1" ~ 1,
      sec5_q13b_1 == "2" ~ 0,
      TRUE ~ NA_integer_
    ),
    heard_of_Republic_India = case_when(
      sec5_q13c_1 == "1" ~ 1,
      sec5_q13c_1 == "2" ~ 0,
      TRUE ~ NA_integer_
    ),
    heard_of_Zee_News = case_when(
      sec5_q13d_1 == "1" ~ 1,
      sec5_q13d_1 == "2" ~ 0,
      TRUE ~ NA_integer_
    ),
    heard_of_Aaj_News = case_when(
      sec5_q13e_1 == "1" ~ 1,
      sec5_q13e_1 == "2" ~ 0,
      TRUE ~ NA_integer_
    ),
    heard_of_News_18 = case_when(
      sec5_q13f_1 == "1" ~ 1,
      sec5_q13f_1 == "2" ~ 0,
      TRUE ~ NA_integer_
    ),
    heard_of_Dainik_Jagran = case_when(
      sec5_q13h_1 == "1" ~ 1,
      sec5_q13h_1 == "2" ~ 0,
      TRUE ~ NA_integer_
    ),
    heard_of_Hindustan = case_when(
      sec5_q13i_1 == "1" ~ 1,
      sec5_q13i_1 == "2" ~ 0,
      TRUE ~ NA_integer_
    ),
    heard_of_Dainik_Bhaskar = case_when(
      sec5_q13j_1 == "1" ~ 1,
      sec5_q13j_1 == "2" ~ 0,
      TRUE ~ NA_integer_
    ),
    news_exposure_index = as.numeric(scale(heard_of_NDTV_India + heard_of_DD_news + heard_of_Republic_India + heard_of_Zee_News + heard_of_Aaj_News + heard_of_News_18 + heard_of_Dainik_Jagran + heard_of_Hindustan + heard_of_Hindustan + heard_of_Dainik_Bhaskar)),
    vaccinated = fct_case_when(
      sec6_q1 == "a" ~ "Yes",
      sec6_q1 == "b" ~ "No",
      TRUE ~ "Don't know/Refused"
    ),
    ayurveda_effective = fct_case_when(
      sec6_q8 == "a" ~ "Effective",
      sec6_q8 == "b" ~ "Somewhat effective",
      sec6_q8 == "c" ~ "Not effective",
      TRUE ~ "Don't know/Refused"
    ),
    illness_bad_response = case_when(
      sec6_q7 %in% c("a", "b", "c", "d") ~ 1,
      sec6_q7 %in% c("e", "f") ~ 0,
      TRUE ~ NA_integer_
    ),
    science = (sec8_q1_2 == "a") + (sec8_q1_3 == "a") + (sec8_q1_5 == "a") + (sec8_q2 == "a") + (sec8_q1_1 == "b") + (sec8_q1_4 == "b") + (sec8_q1_6 == "b") + (sec8_q3 == "c"),
    jati = sec3_q8a,
    age = as.numeric(sec1_q2), 
    hindi_medium = fct_case_when(
      sec1_q5 %in% c("a", "c") ~ 0,
      sec1_q5 == "b" ~ 1,
      TRUE ~ NA_integer_
    ),
    reading = (sec1_q9_2 == "a") + (sec1_q9_2_2 == "b") 
  ) %>%
  select(Child_BIMLI_Code, BIMLI_Village_AttendanceCode, SIMON_Village_Code, Treatment, CLCDC, district_name, block_name, CLF, Gram_Panchayat, spillover_pre, district_spillover_pre, spillover_post,  national_voting_19:state_party_id, jati, jati_new,village_nightlight_viirs_mean_2021, gender:science, reading, age, hindi_medium, bjp_coalition_vote_share)

### Recode baseline data to numeric values ----
# Derive 0/1 indicators from the categorical baseline covariates, plus the
# composite indices built on them. `%in%` never yields NA, so a missing answer
# is coded 0 (not NA) in every indicator below.
baseline_subset <- baseline_subset %>%
  mutate(
    gender_num = as.numeric(gender %in% "Girl"),
    # gender_num_male = as.numeric(gender %in% "Boy"),
    school_gov_num = as.numeric(school %in% "Government"),
    religion_hindu_num = as.numeric(religion %in% "Hindu"),
    caste_gen_num = as.numeric(caste %in% "GEN"),
    proteins_both_num = as.numeric(proteins %in% "Both"),
    own_phone_num = as.numeric(own_phone %in% "Yes"),
    own_washer_num = as.numeric(own_washer %in% "Yes"),
    own_fridge_num = as.numeric(own_fridge %in% "Yes"),
    mobile_internet_num = as.numeric(mobile_internet %in% "Yes"),
    trust_newspapers_num = as.numeric(trust_newspapers %in% "Believe"),
    trust_social_media_num = as.numeric(trust_social_media %in% "Believe"),
    trust_tv_num = as.numeric(trust_tv %in% "Believe"),
    trust_friends_family_num = as.numeric(trust_friends_family %in% "Believe"),
    vaccinated_num = as.numeric(vaccinated %in% "Yes"),
    # z-score of (z-scored mobile internet + z-scored phone ownership +
    # news exposure index)
    media_exposure_index = as.numeric(scale(
      scale(mobile_internet_num) + scale(own_phone_num) + news_exposure_index
    )),
    ayurveda_effective_num = as.numeric(
      ayurveda_effective %in% c("Effective", "Somewhat effective")
    ),
    # Sum of the two z-scored components; the sum itself is not re-scaled
    non_scientific_health_beliefs_index = as.numeric(
      scale(ayurveda_effective_num) + scale(illness_bad_response)
    ),
    # 0 when the party id matches one of the listed patterns, NA when the id
    # is missing, 1 for every other party
    nat_el_2019_non_bjp_num = case_when(
      grepl("bjp", national_party_id) |
        grepl("jdu", national_party_id) |
        grepl("ljp", national_party_id) ~ 0,
      is.na(national_party_id) ~ NA_integer_,
      TRUE ~ 1
    ),
    # z-score of the raw asset share over the whole baseline sample
    asset_index =
      (raw_asset_index - mean(raw_asset_index, na.rm = TRUE)) /
        sd(raw_asset_index, na.rm = TRUE),
    state_el_2020_non_bjp_num = case_when(
      grepl("bjp", state_party_id) |
        grepl("jdu", state_party_id) |
        grepl("hum", state_party_id) |
        grepl("vip", state_party_id) ~ 0,
      is.na(state_party_id) ~ NA_integer_,
      TRUE ~ 1
    )
  )

## Subset teacher data ----
# Keep the merge key plus every `teacher_*` column from each teacher file,
# stack the two files, and store the merge key as character.
english_teachers_sub <- english_teachers %>%
  dplyr::select(teacher_merge_code, starts_with("teacher_")) %>%
  mutate(across(teacher_merge_code, as.character))

media_teachers_sub <- dplyr::select(
  media_teachers,
  teacher_merge_code, starts_with("teacher_")
)

teachers_sub <- english_teachers_sub %>%
  rbind(media_teachers_sub) %>%
  mutate(across(teacher_merge_code, as.character))

## Subset endline data ----
# Swap two values inside a vector (used for the sharing discernment questions).
# Every element equal to `value1` becomes `value2` and vice versa; all other
# elements, including NA, are returned unchanged.
swap_values <- function(x, value1, value2) {
  # Record both sets of positions before writing, so the first replacement
  # cannot leak into the second one.
  pos_first <- which(x == value1)
  pos_second <- which(x == value2)
  x[pos_second] <- value1
  x[pos_first] <- value2
  x
}

# Build the endline analysis variables from the raw endline survey.
# - Timing: survey date, days since the earliest survey date, a week label and
#   a three-day bin.
# - Outcomes: items marked "negate" are sign-flipped; sharing items first swap
#   codes 2 and 3 via swap_values().
# - Guardian items: recoded to 0/1 indicators or small ordered scores.
# The final select() keeps Child_BIMLI_Code, sup_code and the positional range
# `endline_date:guardian_involvement3`, so the assignments below must stay in
# this order and no helper columns should be added inside the mutate().
endline_subset <- endline %>%
  mutate(
    endline_date = lubridate::as_date(StartTime),
    endline_days_from_first = as.numeric(endline_date - min(endline_date, na.rm = TRUE)),
    endline_weeks_from_first = case_when(
      endline_days_from_first < 7 ~ "Week 1",
      endline_days_from_first >= 7 & endline_days_from_first < 14  ~ "Week 2",
      endline_days_from_first >= 14 & endline_days_from_first < 21  ~ "Week 3",
      endline_days_from_first >= 21 & endline_days_from_first < 28 ~ "Week 4",
      endline_days_from_first >= 28 & endline_days_from_first < 35 ~ "Week 5",
      endline_days_from_first >= 35 ~ "Week 6+",
      TRUE ~ NA_character_),
    bin_number = floor(endline_days_from_first / 3) + 1,
    # Label "<bin> Day <first>-<last>" with 1-based day numbers; offsets above
    # 51 are set to NA here.
    endline_three_day_bins = ifelse(endline_days_from_first > 51, NA,
                                    paste0(
                                      bin_number, 
                                      " Day ", 
                                      floor(endline_days_from_first / 3) * 3 + 1, 
                                      "-", 
                                      floor(endline_days_from_first / 3) * 3 + 3)), 
    # The level set stops at "17 Day 49-51", so the label produced for offset
    # 51 ("18 Day 52-54") is not a level and factor() turns it into NA too.
    endline_three_day_bins = factor(
      endline_three_day_bins,
      levels = unique(paste0(
        floor(0:49 / 3) + 1, 
        " Day ", 
        floor(0:49 / 3) * 3 + 1, 
        "-", 
        floor(0:49 / 3) * 3 + 3))),
    self_reported_compliance = 5 - sec0_q1, # reverse-code as 5 - answer
    manipulation_check = sec0_q4, 
    # dvs
    misinfo_threat1 = sec1_q1, 
    news_manipulation1 = -sec1_q5, # negate
    news_manipulation2 = -sec1_q6, # negate
    bias2 = -sec1_q8, # negate
    bias3 = -sec1_q9, # negate
    discernment1 = sec2_q1, 
    discernment2 = sec2_q2,
    discernment4 = sec2_q4,
    discernment5 = sec2_q5,
    discernment7 = -sec2_q7, # negate
    discernment8 = -sec2_q8, # negate
    discernment9 = -sec2_q9, # negate
    discernment10 = -sec2_q10, # negate
    sharing1 = swap_values(sec3_q1, 2, 3), # swap no and maybe
    sharing2 = swap_values(sec3_q2, 2, 3), # swap no and maybe
    sharing4 = swap_values(sec3_q4, 2, 3), # swap no and maybe
    sharing5 = swap_values(sec3_q5, 2, 3), # swap no and maybe
    sharing7 = -swap_values(sec3_q7, 2, 3), # negate; swap no and maybe
    sharing8 = -swap_values(sec3_q8, 2, 3), # negate; swap no and maybe
    sharing9 = -swap_values(sec3_q9, 2, 3), # negate; swap no and maybe
    sharing10 = -swap_values(sec3_q10, 2, 3), # negate; swap no and maybe
    news_interest_health = -sec4_q1_5, # negate
    vaccine_safety1 = -sec4_q5, # negate
    vaccine_safety2 = -sec4_q6, # negate
    illness_response1 = sec4_q9_a, # traditional remedies
    illness_response2 = sec4_q9_c, # local healer
    ayurveda = sec4_q10, # ayurveda ineffective
    source_discern_generic1 = -sec5_q1_h, # MBBS doctors; negate
    source_discern_generic2 = -sec5_q1_q, # health workers such as ASHA; negate
    source_discern_generic5 = -sec5_q1_o, # Government-issued health pamphlets or posters; negate
    source_discern_generic3 = sec5_q1_j, # jholachhap doctors
    source_discern_generic4 = sec5_q1_k, # word of mouth
    source_discern_generic6 = sec5_q1_i, # ayurvedic doctors
    source_discern_specific1 = -sec5_q2_a, # local health worker or community health center; negate
    source_discern_specific2 = -sec5_q2_b, # government pamphlets or posters; negate
    source_discern_specific3 = -sec5_q2_c, # TV interview with AIIMS doctor; negate
    source_discern_specific4 = sec5_q2_d, # family remedies
    source_discern_specific5 = sec5_q2_e, # WhatsApp forwards
    source_discern_specific6 = sec5_q2_f, # TV interview with ayurvedic doctor
    cues1 = -sec5_q3_b, # reputable media outlet; negate
    cues2 = -sec5_q3_d, # emotional or sensation tone; negate
    cues3 = sec5_q3_a, # likes or shares
    cues4 = sec5_q3_e, # same community
    engagement_attitude1a = -sec6_q1_a, # correct your friend; negate
    engagement_attitude1b = sec6_q1_e, # share with others
    engagement_attitude2 = -sec6_q2, # negate
    engagement_attitude3 = -sec6_q3, # negate
    # `_raw` copies hold the same values under a second name
    engagement_attitude1a_raw = -sec6_q1_a, # correct your friend; negate
    engagement_attitude1b_raw = sec6_q1_e, # share with others
    engagement_attitude2_raw = -sec6_q2, # negate
    engagement_attitude3_raw = -sec6_q3, # negate
    engagement_behavior1 = -sec6_q4, # negate
    engagement_behavior2 = -sec6_q5, # negate
    engagement_behavior1_raw = -sec6_q4, # negate
    engagement_behavior2_raw = -sec6_q5, # negate
    # Amount attached to the first of sec6_q6..sec6_q10 answered 1, negated;
    # NA when none of them is 1.
    engagement_behavior_pay_comb = -case_when(sec6_q6 == 1 ~ 0,
                                              sec6_q7 == 1 ~ 50,
                                              sec6_q8 == 1 ~ 100,
                                              sec6_q9 == 1 ~ 200,
                                              sec6_q10 == 1 ~ 500,
                                              TRUE ~ NA_integer_),
    engagement_behavior_pay_no_price = -case_when(sec6_q10 == 2 ~ 1,
                                                  sec6_q10 == 1 ~ 0,
                                                  TRUE ~ NA_integer_),
    # sec7_q1 is searched for the stand-alone codes 1-4; a missing answer
    # stays NA, otherwise each activity is flagged 1/0.
    civic_participation_collaborate = case_when(is.na(sec7_q1) ~ NA_integer_,
                                                str_detect(sec7_q1, "\\b1\\b") ~ 1,
                                                TRUE ~ 0),
    civic_participation_rally = case_when(is.na(sec7_q1) ~ NA_integer_,
                                          str_detect(sec7_q1, "\\b2\\b") ~ 1,
                                          TRUE ~ 0),
    civic_participation_party_member = case_when(is.na(sec7_q1) ~ NA_integer_,
                                                 str_detect(sec7_q1, "\\b3\\b") ~ 1,
                                                 TRUE ~ 0),
    civic_participation_volunteer = case_when(is.na(sec7_q1) ~ NA_integer_,
                                              str_detect(sec7_q1, "\\b4\\b") ~ 1,
                                              TRUE ~ 0),
    # Mechanism items: reverse the 1-3 coding (1 -> 3, 2 -> 2, 3 -> 1)
    mechanism_logic = case_when(sec7_q7_a == 1 ~ 3,
                                sec7_q7_a == 2 ~ 2,
                                sec7_q7_a == 3 ~ 1,
                                TRUE ~ NA_integer_),
    mechanism_intuition = case_when(sec7_q7_b == 1 ~ 3,
                                    sec7_q7_b == 2 ~ 2,
                                    sec7_q7_b == 3 ~ 1,
                                    TRUE ~ NA_integer_),
    mechanism_others = case_when(sec7_q7_d == 1 ~ 3,
                                 sec7_q7_d == 2 ~ 2,
                                 sec7_q7_d == 3 ~ 1,
                                 TRUE ~ NA_integer_),
    mechanism_emotions = case_when(sec7_q7_f == 1 ~ 3,
                                   sec7_q7_f == 2 ~ 2,
                                   sec7_q7_f == 3 ~ 1,
                                   TRUE ~ NA_integer_),
    # `%in%` is never NA, so an unanswered sec10_q1 has to be caught
    # explicitly; otherwise it would be coded 0 instead of NA.
    guardian_parent = case_when(
      is.na(sec10_q1) ~ NA_integer_,
      sec10_q1 %in% c("a", "b") ~ 1,
      TRUE ~ 0),
    guardian_female = case_when(
      is.na(sec10_q1) ~ NA_integer_,
      sec10_q1 %in% c("a", "d", "e") ~ 1,
      TRUE ~ 0),
    guardian_ayurveda = sec10_q7,
    guardian_child_gains_when_mother_works = case_when(
      sec10_q10 == 2 ~ 1,
      !sec10_q10 == 2 ~ 0,
      TRUE ~ NA_integer_),
    guardian_woman_earns_more_positive = case_when(
      sec10_q14 == 1 ~ 0,
      sec10_q14 == 2 ~ 2,
      sec10_q14 == 3 ~ 1,
      TRUE ~ NA_integer_),
    guardian_involvement1 = case_when(
      sec11_q1 == "a" ~ 0,
      !sec11_q1 == "a" ~ 1,
      TRUE ~ NA_integer_),
    # NOTE(review): grepl() returns FALSE for NA input, so a missing sec11_q2
    # is coded 0 in the eight indicators below and their final NA branch is
    # never reached. Confirm whether a missing sec11_q2 should be NA instead.
    guardian_involvement2_mother = case_when(
      grepl("a", sec11_q2) ~ 1,
      !grepl("a", sec11_q2) ~ 0,
      TRUE ~ NA_integer_),
    guardian_involvement2_father = case_when(
      grepl("b", sec11_q2) ~ 1,
      !grepl("b", sec11_q2) ~ 0,
      TRUE ~ NA_integer_),
    guardian_involvement2_brother = case_when(
      grepl("c", sec11_q2) ~ 1,
      !grepl("c", sec11_q2) ~ 0,
      TRUE ~ NA_integer_),
    guardian_involvement2_sister = case_when(
      grepl("d", sec11_q2) ~ 1,
      !grepl("d", sec11_q2) ~ 0,
      TRUE ~ NA_integer_),
    guardian_involvement2_elder_female = case_when(
      grepl("e", sec11_q2) ~ 1,
      !grepl("e", sec11_q2) ~ 0,
      TRUE ~ NA_integer_),
    guardian_involvement2_elder_male = case_when(
      grepl("f", sec11_q2) ~ 1,
      !grepl("f", sec11_q2) ~ 0,
      TRUE ~ NA_integer_),
    guardian_involvement2_neighbor = case_when(
      grepl("g", sec11_q2) ~ 1,
      !grepl("g", sec11_q2) ~ 0,
      TRUE ~ NA_integer_),
    guardian_involvement2_other = case_when(
      grepl("h", sec11_q2) ~ 1,
      !grepl("h", sec11_q2) ~ 0,
      TRUE ~ NA_integer_),
    guardian_involvement3 = case_when(
      sec11_q3 == "a" ~ "a. Others present but not paying attention",
      sec11_q3 == "b" ~ "b. Others present but stayed silent",
      sec11_q3 == "c" ~ "c. Same room, interjected",
      sec11_q3 == "d" ~ "d. Same room, interjected frequently",
      TRUE ~ NA_character_)) %>%
  select(Child_BIMLI_Code, sup_code, endline_date:guardian_involvement3)

## Subset student follow-up ----
# Student follow-up: child code plus the column range from
# `follow_student_endtime` through `follow_weeks_from_first`.
follow_up_student_sub <- select(
  follow_up_student,
  child_code_clean,
  follow_student_endtime:follow_weeks_from_first
)

## Subset guardian follow-up ----
# Guardian follow-up: child code plus every `follow_guardian*` column.
follow_up_guardian_sub <- select(
  follow_up_guardian,
  child_code_clean,
  starts_with("follow_guardian")
)

# Merge ----
## Create merged data ----
# merge baseline, attendance, and endline data
# Assemble the merged analysis data: baseline rows first, then attendance,
# teacher characteristics, endline answers and both follow-up surveys. Every
# step is a left join, so the baseline sample defines the rows.
bimli <- baseline_subset %>%
  left_join(
    select(attendance, Child.Code, starts_with("Session")),
    by = c("Child_BIMLI_Code" = "Child.Code")
  ) %>%
  mutate(
    # Row sum of the Session* columns (no na.rm, so any NA yields NA)
    compliance = rowSums(select(., starts_with("Session"))),
    treatment_uptake = case_when(
      Treatment == "Media Literacy" ~ compliance / 4,
      Treatment == "Spoken English" ~ 0
    ),
    treatment_uptake_minimal = case_when(
      Treatment == "Media Literacy" & compliance > 0 ~ 1,
      Treatment == "Media Literacy" & compliance < 1 ~ 0,
      Treatment == "Spoken English" ~ 0,
      TRUE ~ NA_integer_
    ),
    treatment_num = case_when(
      Treatment == "Media Literacy" ~ 1,
      Treatment == "Spoken English" ~ 0,
      TRUE ~ NA_integer_
    ),
    # NOTE(review): the catch-all branch also receives rows whose `compliance`
    # is NA, assuming fct_case_when() (defined elsewhere) evaluates its
    # conditions like case_when(). Confirm this is intended.
    compliance_category = fct_case_when(
      compliance == 0 ~ "Non-compliant",
      compliance < 4 & compliance != 0 ~ "Partially compliant",
      TRUE ~ "Fully compliant"
    ),
    compliance_minimal = case_when(
      compliance == 0 ~ 0,
      compliance %in% c(1, 2, 3, 4) ~ 1,
      TRUE ~ NA_integer_
    ),
    compliance_full = case_when(
      compliance == 0 ~ 0,
      compliance == 4 ~ 1,
      TRUE ~ NA_integer_
    )
  ) %>%
  # Temporary key for the teacher merge: the attendance village code for the
  # Media Literacy arm, the SIMON village code for the Spoken English arm.
  mutate(
    teacher_merge_code = case_when(
      Treatment == "Media Literacy" ~ as.character(BIMLI_Village_AttendanceCode),
      Treatment == "Spoken English" ~ as.character(SIMON_Village_Code),
      TRUE ~ NA_character_
    )
  ) %>%
  left_join(teachers_sub, by = "teacher_merge_code") %>%
  select(-teacher_merge_code) %>%
  left_join(endline_subset, by = "Child_BIMLI_Code") %>%
  # Record attrition: a child counts as complete when its code appears in the
  # raw endline data.
  mutate(
    endline_complete = as.numeric(Child_BIMLI_Code %in% endline$Child_BIMLI_Code),
    attrition = 1 - endline_complete
  ) %>%
  left_join(follow_up_student_sub, by = c("Child_BIMLI_Code" = "child_code_clean")) %>%
  left_join(follow_up_guardian_sub, by = c("Child_BIMLI_Code" = "child_code_clean"))

# make treatment a factor, manipulation check, and rename variables
# Turn `Treatment` into a factor with "Spoken English" as the first level,
# recode the manipulation check to 1 when the answer matches the assigned arm
# (1 for Media Literacy, 2 for Spoken English) and 0 otherwise, including when
# either value is missing. Then rename the key columns and normalise every
# column name to lower case with dots removed.
bimli <- bimli %>%
  mutate(
    Treatment = factor(Treatment, levels = c("Spoken English", "Media Literacy")),
    manipulation_check = if_else(
      (Treatment == "Media Literacy" & manipulation_check == 1) |
        (Treatment == "Spoken English" & manipulation_check == 2),
      1,
      0,
      missing = 0
    )
  ) %>%
  rename(
    child_code = Child_BIMLI_Code,
    village = BIMLI_Village_AttendanceCode,
    treatment = Treatment
  ) %>%
  rename_with(\(nm) str_remove_all(str_to_lower(nm), "\\."))

# Code follow-up timestamps
# Flag whether the student follow-up ended before the guardian follow-up:
# level "1" when the student end time is strictly earlier, "0" when it is not,
# and NA when either end time is missing.
bimli <- bimli %>%
  mutate(
    follow_time_student_first = factor(
      as.numeric(follow_student_endtime < follow_guardian_endtime)
    )
  )

## Replace 88, 99 with NA before creating indices ----
# Blank out the codes 88 and 99, in either sign, in every numeric column.
# The four na_if() replacements run in one pass over the numeric columns.
bimli <- bimli %>%
  mutate(across(
    where(is.numeric),
    \(col) col %>%
      na_if(88) %>%
      na_if(99) %>%
      na_if(-88) %>%
      na_if(-99)
  ))

## Functions to standardize outcomes ----
# Column-wise z-scores relative to a reference group.
#
# X      : numeric matrix (one column per item).
# sgroup : logical row selector for the reference group whose mean and sd are
#          used; NULL (the default) uses every row.
# Returns X with each column centred on the reference-group mean and divided
# by the reference-group sd (NAs ignored when computing both).
standardize <- function(X, sgroup = NULL) {
  # https://github.com/cdsamii/make_index/blob/master/r/index_comparison.R
  if (is.null(sgroup)) sgroup <- rep(TRUE, nrow(X))
  # seq_len() so a zero-column matrix is returned untouched instead of
  # looping over c(1, 0) and failing on a missing column.
  for (j in seq_len(ncol(X))) {
    ref <- X[sgroup, j]
    X[, j] <- (X[, j] - mean(ref, na.rm = TRUE)) / sd(ref, na.rm = TRUE)
  }
  X
}

# z-score a single vector relative to a reference group.
#
# X      : numeric vector.
# sgroup : logical selector for the reference group; by default the rows of
#          the global `bimli` whose treatment is "Spoken English". NULL uses
#          every element.
# Values equal to 88 or 99 in absolute value are set to NA before scaling.
# Returns (X - mean of reference group) / sd of reference group, NAs ignored
# when computing the mean and sd.
standardize_col <- function(X, sgroup = (bimli$treatment == "Spoken English")) {
  X[which(abs(X) %in% c(88, 99))] <- NA

  # X is a vector, so its size is length(X); nrow(X) would be NULL and make
  # rep() fail.
  if (is.null(sgroup)) sgroup <- rep(TRUE, length(X))
  ref <- X[sgroup]
  (X - mean(ref, na.rm = TRUE)) / sd(ref, na.rm = TRUE)
}

# Inverse-covariance-weighted index of several items.
#
# ...    : numeric vectors of equal length, one per item.
# sgroup : logical selector for the reference group used in every
#          standardisation step; by default the rows of the global `bimli`
#          whose treatment is "Spoken English". NULL uses every row.
# dkr.rm : if TRUE, values equal to 88 or 99 in absolute value are set to NA
#          first.
# Steps: standardise each item on the reference group, weight the items by
# the row sums of the inverse covariance matrix (computed on complete rows,
# normalised to sum to one), and standardise the weighted sum on the same
# reference group.
icw_idx <- function(..., sgroup = (bimli$treatment == "Spoken English"), dkr.rm = TRUE) {
  # https://github.com/cdsamii/make_index/blob/master/r/index_comparison.R
  dots <- rlang::list2(...)
  X <- matrix(unlist(dots, use.names = FALSE), ncol = length(dots), byrow = FALSE)

  # Resolve NULL once so both standardisation steps use the same group.
  if (is.null(sgroup)) sgroup <- rep(TRUE, nrow(X))

  if (dkr.rm == TRUE) X[which(abs(X) %in% c(88, 99))] <- NA

  X <- standardize(X, sgroup)

  i.vec <- as.matrix(rep(1, ncol(X)))
  # Invert the covariance matrix once and reuse it in both factors.
  C_inv <- solve(cov(X, use = "complete.obs"))
  weights <- solve(t(i.vec) %*% C_inv %*% i.vec) %*% t(i.vec) %*% C_inv

  # Pass `sgroup` on so a caller-supplied reference group also governs the
  # final standardisation, not only the item-level one.
  standardize_col(t(weights %*% t(X))[, 1], sgroup = sgroup)
}

# Standardised mean of two items.
#
# x, y   : numeric vectors of equal length.
# dkr.rm : if TRUE, values equal to 88 or 99 in absolute value are set to NA
#          before scaling (standardize_col() blanks the same codes itself).
# Each item is z-scored with standardize_col(), the two z-scores are averaged,
# and the average is z-scored again.
average <- function(x, y, dkr.rm = TRUE) {
  if (dkr.rm == TRUE) {
    x[which(abs(x) %in% c(88, 99))] <- NA
    y[which(abs(y) %in% c(88, 99))] <- NA
  }
  z_x <- standardize_col(x)
  z_y <- standardize_col(y)
  standardize_col((z_x + z_y) / 2)
}

# Standardised "difference" of two items.
#
# x, y   : numeric vectors of equal length; `y` is expected to arrive already
#          sign-flipped, which is why the two z-scores are added.
# dkr.rm : if TRUE, values equal to 88 or 99 in absolute value are set to NA
#          before scaling (standardize_col() blanks the same codes itself).
# Each item is z-scored with standardize_col(), the z-scores are summed, and
# the sum is z-scored again.
difference <- function(x, y, dkr.rm = TRUE) {
  if (dkr.rm == TRUE) {
    x[which(abs(x) %in% c(88, 99))] <- NA
    y[which(abs(y) %in% c(88, 99))] <- NA
  }
  z_x <- standardize_col(x)
  z_y <- standardize_col(y)
  standardize_col(z_x + z_y) # y should already be negative
}

## Standardize questions ----
# Single survey items that are analysed on their own (not only as part of an
# index). Each is z-scored against the "Spoken English" rows.
outcomes_questions_analyze_separately <- c(
  "misinfo_threat1",
  "news_manipulation1", "news_manipulation2",
  "news_interest_health",
  "engagement_attitude1a", "engagement_attitude1b",
  "engagement_attitude2", "engagement_attitude3",
  "engagement_behavior1", "engagement_behavior2"
)

bimli <- bimli %>%
  mutate(across(
    all_of(outcomes_questions_analyze_separately),
    ~ standardize_col(.x, sgroup = treatment == "Spoken English")
  ))

## Create subindices ----
# Combine individual items into sub-indices. `average()` is used for the
# two-item means and `icw_idx()` for inverse-covariance-weighted indices;
# single items are only z-scored with `standardize_col()`.
bimli <- bimli %>%
  # Endline
  mutate(
    news_manipulation = average(news_manipulation1, news_manipulation2),
    bias = average(bias2, bias3),
    accuracy_true = icw_idx(discernment7, discernment8, discernment9, discernment10),
    accuracy_false = icw_idx(discernment1, discernment2, discernment4, discernment5),
    sharing_true = icw_idx(sharing7, sharing8, sharing9, sharing10),
    sharing_false = icw_idx(sharing1, sharing2, sharing4, sharing5),
    # `_excl_snake` variants leave out item 8 (true) and item 5 (false)
    accuracy_true_excl_snake = icw_idx(discernment7, discernment9, discernment10),
    accuracy_false_excl_snake = icw_idx(discernment1, discernment2, discernment4),
    sharing_true_excl_snake = icw_idx(sharing7, sharing9, sharing10),
    sharing_false_excl_snake = icw_idx(sharing1, sharing2, sharing4),
    vaccine_safety = icw_idx(vaccine_safety1, vaccine_safety2),
    traditional_remedies = icw_idx(illness_response1, illness_response2, ayurveda),
    # MBBS doctors; health workers such as ASHA; government-issued health
    # pamphlets or posters
    source_discern_generic_good = icw_idx(
      source_discern_generic1, source_discern_generic2, source_discern_generic5
    ),
    # jholachhap doctors; word of mouth; ayurvedic doctors
    source_discern_generic_bad = icw_idx(
      source_discern_generic3, source_discern_generic4, source_discern_generic6
    ),
    # local health worker or community health center; government pamphlets or
    # posters; TV interview with AIIMS doctor
    source_discern_specific_good = icw_idx(
      source_discern_specific1, source_discern_specific2, source_discern_specific3
    ),
    # family remedies; WhatsApp forwards; TV interview with ayurvedic doctor
    source_discern_specific_bad = icw_idx(
      source_discern_specific4, source_discern_specific5, source_discern_specific6
    ),
    # cues1: reputable media outlet; cues2: emotional or sensational tone;
    # cues3: likes or shares; cues4: same community
    cues_good = average(cues1, cues2),
    cues_index = icw_idx(cues1, cues2, cues3, cues4),
    cues_bad = average(cues3, cues4)
  ) %>%
  # Follow-up student
  mutate(
    follow_accuracy_true = icw_idx(
      follow_discernment5, follow_discernment6,
      follow_discernment7, follow_discernment8
    ),
    follow_accuracy_false = icw_idx(
      follow_discernment1, follow_discernment2,
      follow_discernment3, follow_discernment4
    ),
    follow_discernment_political_true = standardize_col(follow_discernment_political3),
    follow_discernment_political_false = icw_idx(
      follow_discernment_political1,
      follow_discernment_political2,
      follow_discernment_political4
    ),
    follow_source_discern_specific_good = icw_idx(
      follow_source_discern_specific1,
      follow_source_discern_specific2,
      follow_source_discern_specific3
    ),
    follow_source_discern_specific_bad = icw_idx(
      follow_source_discern_specific4,
      follow_source_discern_specific5,
      follow_source_discern_specific6
    ),
    follow_ayurveda_effective = standardize_col(follow_ayurveda_effective)
  ) %>%
  # Follow-up guardian
  mutate(
    follow_guardian_accuracy_true = icw_idx(
      follow_guardian_discernment5, follow_guardian_discernment6,
      follow_guardian_discernment7, follow_guardian_discernment8
    ),
    follow_guardian_accuracy_false = icw_idx(
      follow_guardian_discernment1, follow_guardian_discernment2,
      follow_guardian_discernment3, follow_guardian_discernment4
    ),
    follow_guardian_discernment_political_true = standardize_col(
      follow_guardian_discernment_political3
    ),
    follow_guardian_discernment_political_false = icw_idx(
      follow_guardian_discernment_political1,
      follow_guardian_discernment_political2,
      follow_guardian_discernment_political4
    ),
    follow_guardian_source_discern_specific_good = icw_idx(
      follow_guardian_source_discern_specific1,
      follow_guardian_source_discern_specific2,
      follow_guardian_source_discern_specific3
    ),
    follow_guardian_source_discern_specific_bad = icw_idx(
      follow_guardian_source_discern_specific4,
      follow_guardian_source_discern_specific5,
      follow_guardian_source_discern_specific6
    ),
    follow_guardian_ayurveda_effective = standardize_col(
      follow_guardian_ayurveda_effective
    )
  )

## Standardize subindices ----
# z-score every sub-index (and the mechanism / guardian ayurveda items) in
# place with standardize_col(), using its default reference group.
bimli <- bimli %>%
  mutate(across(
    all_of(c(
      "news_manipulation",
      "bias",
      "accuracy_true",
      "accuracy_true_excl_snake",
      "accuracy_false",
      "accuracy_false_excl_snake",
      "sharing_true",
      "sharing_true_excl_snake",
      "sharing_false",
      "sharing_false_excl_snake",
      "vaccine_safety",
      "traditional_remedies",
      "source_discern_generic_good",
      "source_discern_generic_bad",
      "source_discern_specific_good",
      "source_discern_specific_bad",
      "cues_good",
      "cues_bad",
      "cues_index",
      "mechanism_logic",
      "mechanism_intuition",
      "mechanism_others",
      "mechanism_emotions",
      # Guardian
      "guardian_ayurveda",
      # Student follow-up
      "follow_accuracy_true",
      "follow_accuracy_false",
      "follow_discernment_political_true",
      "follow_discernment_political_false",
      "follow_source_discern_specific_good",
      "follow_source_discern_specific_bad",
      # Guardian follow-up
      "follow_guardian_accuracy_true",
      "follow_guardian_accuracy_false",
      "follow_guardian_discernment_political_true",
      "follow_guardian_discernment_political_false",
      "follow_guardian_source_discern_specific_good",
      "follow_guardian_source_discern_specific_bad"
    )),
    standardize_col
  ))

## Create main indices ----
# Combine the sub-indices into the main outcome indices. `difference()` adds
# its two z-scored arguments (the second is expected to be sign-flipped
# already), `average()` takes their mean, and `icw_idx()` builds an
# inverse-covariance-weighted index.
bimli <- bimli %>%
  # Endline
  mutate(
    # misinfo_threat1 treated as 0 to 5 scale (codebook says 1 to 5 scale)
    awareness = icw_idx(misinfo_threat1, news_manipulation, bias),
    accuracy_discernment = difference(accuracy_true, accuracy_false),
    accuracy_discernment_excl_snake = difference(
      accuracy_true_excl_snake, accuracy_false_excl_snake
    ),
    sharing_discernment = difference(sharing_true, sharing_false),
    sharing_discernment_excl_snake = difference(
      sharing_true_excl_snake, sharing_false_excl_snake
    ),
    health_preferences = icw_idx(
      news_interest_health, vaccine_safety, traditional_remedies
    ),
    source_discernment = icw_idx(
      difference(source_discern_generic_good, source_discern_generic_bad),
      difference(source_discern_specific_good, source_discern_specific_bad),
      cues_index
    ),
    engagement_attitude = icw_idx(
      difference(engagement_attitude1a, engagement_attitude1b),
      engagement_attitude2,
      engagement_attitude3
    ),
    engagement_behavior = average(engagement_behavior1, engagement_behavior2),
    engagement_behavior_pay = average(
      engagement_behavior_pay_comb, engagement_behavior_pay_no_price
    ),
    civic_participation = icw_idx(
      civic_participation_collaborate,
      civic_participation_rally,
      civic_participation_party_member,
      civic_participation_volunteer
    )
  ) %>%
  # Student follow-up
  mutate(
    follow_accuracy_discernment = difference(
      follow_accuracy_true, follow_accuracy_false
    ),
    follow_accuracy_discernment_political = difference(
      follow_discernment_political_true, follow_discernment_political_false
    ),
    follow_source_discernment = difference(
      follow_source_discern_specific_good, follow_source_discern_specific_bad
    )
  ) %>%
  # Guardian follow-up
  mutate(
    follow_guardian_accuracy_discernment = difference(
      follow_guardian_accuracy_true, follow_guardian_accuracy_false
    ),
    follow_guardian_accuracy_discernment_political = difference(
      follow_guardian_discernment_political_true,
      follow_guardian_discernment_political_false
    ),
    follow_guardian_source_discernment = difference(
      follow_guardian_source_discern_specific_good,
      follow_guardian_source_discern_specific_bad
    )
  )

## Standardize main indices ----
# Standardizes each index in place with `standardize_col()` and then builds the
# composite "index of indices" from the already-standardized components.
# Expressions inside `mutate()` are evaluated in order, so each group of
# components must stay ahead of the composite that consumes it.
bimli <- bimli %>%
  mutate(
    # Main outcomes: overwrite each index with its standardized version
    across(
      c(
        awareness,
        accuracy_discernment,
        accuracy_discernment_excl_snake,
        sharing_discernment,
        sharing_discernment_excl_snake,
        health_preferences,
        source_discernment,
        engagement_attitude,
        engagement_behavior
      ),
      standardize_col
    ),
    # Create index of indices (the `_excl_snake` variants are not components)
    index_of_indices = icw_idx(
      awareness,
      accuracy_discernment,
      sharing_discernment,
      health_preferences,
      source_discernment,
      engagement_attitude,
      engagement_behavior
    ),
    index_of_indices = standardize_col(index_of_indices), # Standardize
    # Student follow-up
    across(
      c(
        follow_accuracy_discernment,
        follow_accuracy_discernment_political,
        follow_source_discernment
      ),
      standardize_col
    ),
    follow_index_of_indices = icw_idx(
      follow_accuracy_discernment,
      follow_accuracy_discernment_political,
      follow_source_discernment
    ),
    follow_index_of_indices = standardize_col(follow_index_of_indices), # Standardize
    # Guardian follow-up (standardized only; no composite is built here)
    across(
      c(
        follow_guardian_accuracy_discernment,
        follow_guardian_accuracy_discernment_political,
        follow_guardian_source_discernment
      ),
      standardize_col
    )
  )

## Create indicator for whether student was contacted for follow-up ----
# Load randomization data (one row per student with its randomized call order)
call_order_follow <- read_csv("data/raw/follow_up/randomization_follow_up.csv")

# A student counts as reached when their code appears in the follow-up data.
# `%in%` already returns TRUE/FALSE (never NA), so no `if_else()` is needed.
call_order_follow <- call_order_follow %>%
  mutate(follow_reached = Child_BIMLI_Code %in% follow_up_student$child_code)

# For each village, find the highest call order among "reached" students
last_called_per_village <- call_order_follow %>%
  filter(follow_reached) %>%
  group_by(village) %>%
  summarize(last_called_order = max(call_order, na.rm = TRUE))

# Join this info back with the call-order data.
# Everyone at or before the last reached student in their village is treated as
# "Called". Villages without any reached student have a missing
# `last_called_order`, so the comparison is NA and the student falls through to
# "Not Called".
call_order_follow_status <- call_order_follow %>%
  left_join(last_called_per_village, by = join_by(village)) %>%
  mutate(
    follow_call_status = case_when(
      call_order <= last_called_order ~ "Called",
      .default = "Not Called"
    )
  ) %>%
  select(Child_BIMLI_Code, follow_reached, follow_call_status) %>%
  mutate(
    # Recode to 0/1 among called students; students who were never called get
    # NA. `NA_real_` keeps every outcome of the `case_when()` a double.
    follow_reached = case_when(
      follow_call_status == "Called" & !follow_reached ~ 0,
      follow_call_status == "Called" & follow_reached ~ 1,
      .default = NA_real_
    )
  )

# Merge with main df.
# `relationship = "many-to-one"` makes the join fail loudly if a student code
# appears more than once in the randomization file, instead of silently
# duplicating that student's row in `bimli`.
bimli <- bimli %>%
  left_join(
    call_order_follow_status,
    by = join_by(child_code == Child_BIMLI_Code),
    relationship = "many-to-one"
  )

## Create library code and library-strata variable ----
# `library_id` is the first run of digits found in `village`.
# The two strata labels combine an identifier with `spillover_pre`. A plain
# `paste0()` would turn a missing `spillover_pre` into a literal label such as
# "X_NA"; the labels are therefore left missing in that case, matching how
# `district_spillover_pre` is built for the village-level data at the top of
# this script.
bimli <- bimli %>%
  mutate(
    library_id = str_extract(village, "\\d+"),
    library_spillover_pre = if_else(
      is.na(spillover_pre),
      NA_character_,
      paste0(library_id, "_", spillover_pre)
    ),
    district_spillover_pre = if_else(
      is.na(spillover_pre),
      NA_character_,
      paste0(district_name, "_", spillover_pre)
    )
  )

## Re-order variables ----
# Keeps only the columns matched below, in this order; any column of `bimli`
# that none of these selections matches is dropped.
bimli <- select(
  bimli,
  child_code:treatment,
  treatment_num,
  clcdc:gram_panchayat,
  library_id,
  spillover_pre,
  district_spillover_pre,
  library_spillover_pre,
  spillover_post,
  national_voting_19:state_el_2020_non_bjp_num,
  session1:mechanism_emotions,
  guardian_parent:guardian_involvement3,
  endline_complete,
  attrition,
  news_manipulation:cues_bad,
  awareness:civic_participation,
  index_of_indices,
  # `follow_*` columns with no "guardian" anywhere after the prefix
  matches("^follow_(?!.*guardian)", perl = TRUE),
  # Guardian follow-up columns go last
  starts_with("follow_guardian")
)

## Create survey design ----
# srvyr design object that uses `village` as the cluster identifier
bimli_svy <- as_survey_design(bimli, ids = village)

# Save merged data ----
# Write the cleaned data frame and its survey-design counterpart to disk
write_rds(bimli, "data/cleaned/merged.rds")
write_rds(bimli_svy, "data/cleaned/merged_svy.rds")
#write_csv(bimli, "data/cleaned/merged.csv")

# Clear environment ----
# Removes all (non-hidden) objects from the current environment.
# NOTE(review): presumably so that later replication scripts start from an
# empty workspace; this also wipes anything a calling script defined.
rm(list = ls())

# END of 01_data_cleaning.R ----